/*THIS IS THE CBP_CO_READER.DO FILE OF ACEMOGLU, AUTOR, DORN, HANSON, AND PRICE (JLE, 2016)
DOWNLOADED FROM https://www.ddorn.net/data/AADHP-GreatSag-FileArchive.zip
MAKES EXTENSIVE USE OF CROSSWALKS FILES IN XWALKS FOLDER (ALSO FROM AADHP)
CHANGES: DIRECTORIES, EXTENDED TO 2016, ADDS IMPUTED ANNUAL AND QUARTERLY
PAYROLLS, AGGREGATES AT THE CZONE-STATE LEVEL
*/


/*
	Description: Cleans the county-level County Business Patterns data files

	Author: Brendan Price
	Updated: July 31, 2014
*/


include directories_build.do


/* Script for allocating aggregate NAICS industries to subordinate 6-digit industries, separately by county */

capture program drop allocate_naics
program define allocate_naics
	* Allocate aggregate industries recursively
	foreach v of numlist 5 4 3 2 0 {
		* Compute quantity of employment to be allocated, at the county x industry level
		bysort countyid naics_`v'd: egen alloc_emp = total(emp * (agg_level == `v')) if naics_`v'd != ""

		* Compute total employment/payroll in subordinate 6-digit industries, at the national level
		bysort naics_`v'd: egen level_emp = total(emp * (agg_level == 6)) if naics_`v'd != ""

		* Verify that recipient industries have positive employment
		assert level_emp > 0 & level_emp < . if alloc_emp > 0 & alloc_emp < .

		* Compute each 6-digit industry's share of employment/payroll in the superordinate industry
		gen level_share = emp / level_emp if agg_level == 6

		* Allocate employment/payroll on the basis of shares
		replace emp = emp + (alloc_emp * level_share) if agg_level == 6
		
		* Zero out the industries that were just allocated
		replace emp = 0 if agg_level == `v'
	
		* Cleanup
		drop alloc_emp level_emp level_share
	}

	assert agg_level == 6 if abs(emp) > 1
	*keep if agg_level == 6
end

/* Process raw county-level CBP data files */

foreach year of numlist 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 {

	* Specify industry variable: SIC codes used through 1997, NAICS thereafter
	if `year' <= 1997 {
		local indvar "sic"
		local maxlevel = 4
	}
	else {
		local indvar "naics"
		local maxlevel = 6
	}

	* Extract 2-digit year
	local y = substr("`year'", 3, 2)

	* Load raw county-level file
	insheet using "`rawdir'`s'cbp`y'co.txt", comma clear 

	* Use FIPS codes (not Census codes) for state and county
	drop censtate cencty
	
	* Disregard payroll variables
	*drop qp1 ap
	
	* The 1999 file has a few duplicates (in one case, containing conflicting information)
	if `year' == 1999 {
		duplicates drop
		drop if fipstate == 1 & fipscty == 45 & naics == "62111/" & empflag == "E"
	}

	* Verify that state x county x industry is a unique identifier
	bysort fipstate fipscty `indvar': assert _N == 1

	* Exclude observations corresponding to state-wide totals
	drop if fipscty == 999

	* Create county codes mergeable with Dorn's county-to-CZ crosswalk
	assert fipstate >= 1 & fipstate <= 56
	assert fipscty >= 1 & fipscty <= 999
	gen countyid = (fipstate * 1000) + fipscty
	label var countyid "Unique county identifier"
	
	* Label remaining variables
	label var emp "Original reported employment value"
	label var empflag "Flag for original reported employment value"
	
	drop fipstate fipscty
	order countyid `indvar'

	* Establishment totals should add up
	assert est == n1_4 + n5_9 + n10_19 + n20_49 + n50_99 + n100_249 + n250_499 + n500_999 + n1000
	assert n1000 == n1000_1 + n1000_2 + n1000_3 + n1000_4
	drop n1000

	* Rename establishment subcounts for ease of looping
	rename n1_4 estab1
	rename n5_9 estab2
	rename n10_19 estab3
	rename n20_49 estab4
	rename n50_99 estab5
	rename n100_249 estab6
	rename n250_499 estab7
	rename n500_999 estab8
	rename n1000_1 estab9
	rename n1000_2 estab10
	rename n1000_3 estab11
	rename n1000_4 estab12
	
	
	label var estab1 "0-4"
	label var estab2 "5-9"
	label var estab3 "10-19"
	label var estab4 "20-49"
	label var estab5 "50-99"
	label var estab6 "100-249"
	label var estab7 "250-499"
	label var estab8 "500-999"
	label var estab9 "1000-1499"
	label var estab10 "1500-2499"
	label var estab11 "2500-4999"
	label var estab12 "5000+"

		display "first"
		
	* Process codes at different levels of aggregation
	if "`indvar'" == "sic" {
		* Verify that SIC codes are at the 4-digit level
		assert length(sic) == 4

	
		
		* The 1988 file contains a few observations (with tiny employment counts) with code 5399, which isn't used elsewhere
		if `year' == 1988 {
			drop if sic == "5399"
		}

		* Merge in details on the SIC hierarchy and verify all codes are accounted for
		merge m:1 sic using "`nesting'sic87_nesting.dta", assert(2 3)
		keep if _merge == 3
		drop _merge
		
		* Rename for symmetry with NAICS variable names
		forvalues i = 0/4 {
			rename sic87_`i'd sic_`i'd
		}
	}
	else {
		* Verify that NAICS codes are at the 6-digit level
		assert length(naics) == 6

			display "second"
			
		* Merge in NAICS hierarchies and verify all codes are accounted for
		if `year' >= 1998 & `year' <= 2002 {
			merge m:1 naics using "`nestingdir'`s'naics97_nesting.dta", assert(2 3)
			keep if _merge == 3
			drop _merge
		}
		else if `year' >= 2003 & `year' <= 2007 {
			merge m:1 naics using "`nestingdir'`s'naics02_nesting.dta", assert(2 3)
			keep if _merge == 3
			drop _merge
		}
		else if `year' >= 2008 & `year' <= 2011 {
			merge m:1 naics using "`nestingdir'`s'naics07_nesting.dta", assert(2 3)
			keep if _merge == 3
			drop _merge
		}
		else if `year' >= 2012 & `year' <= 2016 {
			merge m:1 naics using "`nestingdir'`s'naics12_nesting.dta", assert(2 3)
			keep if _merge == 3
			drop _merge
		}
		else if `year' >= 2017 & `year' <= 2019 {
			merge m:1 naics using "`nestingdir'`s'naics17_nesting.dta", assert(2 3)
			keep if _merge == 3
			drop _merge
		}
		
		* NAICS hierarchy doesn't have 1-digit codes. Add vacuous codes so the script functions.
		gen naics_1d = naics_2d
	}

	/*
		Computing bounds for employment counts (1/2)

		1: The imputed employment of a county-industry cell has to lie within the indicated employment bracket
		2: Narrow down the employment range of a county-industry cell using the firm size distribution of the cell (unless the firm size distribution cannot be reconciled with the indicated employment bracket).
	*/

	* Compute lower and upper bounds according to empflag intervals
	gen lb_emp = emp
	gen ub_emp = emp

	replace lb_emp = 0 if empflag == "A"
	replace lb_emp = 20 if empflag == "B"
	replace lb_emp = 100 if empflag == "C"
	replace lb_emp = 250 if empflag == "E"
	replace lb_emp = 500 if empflag == "F"
	replace lb_emp = 1000 if empflag == "G"
	replace lb_emp = 2500 if empflag == "H"
	replace lb_emp = 5000 if empflag == "I"
	replace lb_emp = 10000 if empflag == "J"
	replace lb_emp = 25000 if empflag == "K"
	replace lb_emp = 50000 if empflag == "L"
	replace lb_emp = 100000 if empflag == "M"

	replace ub_emp = 19 if empflag == "A"
	replace ub_emp = 99 if empflag == "B"
	replace ub_emp = 249 if empflag == "C"
	replace ub_emp = 499 if empflag == "E"
	replace ub_emp = 999 if empflag == "F"
	replace ub_emp = 2499 if empflag == "G"
	replace ub_emp = 4999 if empflag == "H"
	replace ub_emp = 9999 if empflag == "I"
	replace ub_emp = 24999 if empflag == "J"
	replace ub_emp = 49999 if empflag == "K"
	replace ub_emp = 99999 if empflag == "L"
	replace ub_emp = 100000000 if empflag == "M"

	assert lb_emp <= ub_emp
	label var lb_emp "Employment lower bound based on empflag"
	label var ub_emp "Employment upper bound based on empflag"

	* Compute bounds according to establishment counts
	#delimit ;
	gen lb_fsize =
		(0 * estab1)
		+ (5 * estab2)
		+ (10 * estab3)
		+ (20 * estab4)
		+ (50 * estab5)
		+ (100 * estab6)
		+ (250 * estab7)
		+ (500 * estab8)
		+ (1000 * estab9)
		+ (1500 * estab10)
		+ (2500 * estab11)
		+ (5000 * estab12);
	
	gen ub_fsize =
		(4 * estab1)
		+ (9 * estab2)
		+ (19 * estab3)
		+ (49 * estab4)
		+ (99 * estab5)
		+ (249 * estab6)
		+ (499 * estab7)
		+ (999 * estab8)
		+ (1499 * estab9)
		+ (2499 * estab10)
		+ (4999 * estab11)
		+ (100000000 * estab12);
	#delimit cr
	
	assert lb_fsize <= ub_fsize
	label var lb_fsize "Employment lower bound based on establishment counts"
	label var ub_fsize "Employment upper bound based on establishment counts"

	* Check consistency by identifying cases where a lower bound exceeds an upper bound.
	* These observations will be excluded from the imputation regressions.
	gen deviation = 0
	replace deviation = (lb_emp - ub_fsize) if lb_emp > ub_fsize
	replace deviation = (ub_emp - lb_fsize) if lb_fsize > ub_emp
	gen error_consistency = (deviation != 0)
	drop deviation
	label var error_consistency "empflag and establishment count bounds do not agree"

	* Adjusted employment brackets
	gen lb_adj = lb_emp
	gen ub_adj = ub_emp

	* Use tighter firm size brackets, unless inconsistent with employment counts
	replace lb_adj = lb_fsize if lb_fsize > lb_adj & lb_fsize <= ub_adj
	replace ub_adj = ub_fsize if ub_fsize >= lb_adj & ub_fsize < ub_adj

	* When brackets do not overlap, choose "emp-based" count that is closest to being consistent with firm size brackets
	replace lb_adj = ub_adj if ub_adj < lb_fsize
	replace ub_adj = lb_adj if lb_adj > ub_fsize

	label var lb_adj "Adjusted employment lower bound using both empflag and establishment counts"
	label var ub_adj "Adjusted employment upper bound using both empflag and establishment counts"

	/*
		Computing bounds for employment counts (2/2)

		3: Narrow down the resulting employment range using the employment
		range that is implied by the aggregation of subindustries and employment
		in firms with missing subindustry designation (unless this information
		cannot be reconciled with the employment bracket of step 2)
	*/

	* Determine the level of each industry's "parent" industry (may differ by county)
	gen parent_level = .
	label var parent_level "Aggregation level of county x industry pair's most immediate parent industry"

	local v = `maxlevel' - 1
	while `v' >= 0 {
		* Determine if each industry has a v-digit "predecessor" in the county
		bysort countyid `indvar'_`v'd: egen has_level`v'_predecessor = total(agg_level == `v')
		assert has_level`v'_predecessor == 0 | has_level`v'_predecessor == 1
		replace has_level`v'_predecessor = . if agg_level <= `v'

		* Assign each industry its "parent" - that is, the most immediate "precessor" - by county
		replace parent_level = `v' if has_level`v'_predecessor == 1 & parent_level == .

		local v = `v' - 1
	}
	
		display "third"
		
	assert parent_level != . if agg_level >= 1
	assert parent_level < agg_level if agg_level >= 1
	drop has_level*_predecessor
	
		display "fourth"

	* For each county x industry, compute establishment counts in all "child industries" (tot_#_#)
	* Then compute establishment counts for firms with missing detailed industry codes
	forvalues z = 1/12 {
		gen subord_estab`z' = 0 if agg_level < `maxlevel'
		label var subord_estab`z' "# of class-`z' establishments in subordinate industries"
		
		gen miss_estab`z' = .
		label var miss_estab`z' "# of class-`z' establishments missing from subordinate industries"

		forvalues v = 0/`=`maxlevel' - 1' {
			* Determine number of establishments in subordinate industries
			bysort countyid `indvar'_`v'd: egen child_estabs = total(estab`z' * (parent_level == `v'))
			replace subord_estab`z' = child_estabs if agg_level == `v'
			drop child_estabs
			
			* Determine number of establishments in residual category
			bysort countyid `indvar'_`v'd: replace miss_estab`z' = (estab`z' - subord_estab`z') if agg_level == `v'
		}
		
		drop subord_estab`z'
	}
	
	* Identify data errors where "miss_estab#" < 0
	forvalues z = 1/12 {
		* Check fails occasionally for 1988-1990
		if `year' > 1990 {
			assert miss_estab`z' >= 0 if miss_estab`z' < .
		}
		else {
			replace miss_estab`z' = 0 if miss_estab`z' < 0
		}
	}
	
		display "fifth"
		
	* Aggregation of adjusted employment brackets
	gen lb_agg = 0
	gen ub_agg = 0
	gen error_aggregation = 0
	
	label var lb_agg "Employment lower bound based on aggregation of subordinate bounds"
	label var ub_agg "Employment upper bound based on aggregation of subordinate bounds"
	label var error_aggregation "Adjusted bounds disagree with aggregation bounds"
	
	local v = `maxlevel' - 1
	while `v' >= 0 {
		* Add up employment bounds in child industries
		bysort countyid `indvar'_`v'd: egen lb`v'_agg = total(lb_adj * (parent_level == `v'))
		replace lb_agg = lb_agg + lb`v'_agg if agg_level == `v'
		drop lb`v'_agg
		
		bysort countyid `indvar'_`v'd: egen ub`v'_agg = total(ub_adj * (parent_level == `v'))
		replace ub_agg = ub_agg + ub`v'_agg if agg_level == `v'
		drop ub`v'_agg

		* Combine with employment bounds in establishments with missing detailed industry code
		#delimit ;
		replace lb_agg = lb_agg
			+ (0 * miss_estab1)
			+ (5 * miss_estab2)
			+ (10 * miss_estab3)
			+ (20 * miss_estab4)
			+ (50 * miss_estab5)
			+ (100 * miss_estab6)
			+ (250 * miss_estab7)
			+ (500 * miss_estab8)
			+ (1000 * miss_estab9)
			+ (1500 * miss_estab10)
			+ (2500 * miss_estab11)
			+ (5000 * miss_estab12)
		if agg_level == `v';

		replace ub_agg = ub_agg
			+ (4 * miss_estab1)
			+ (9 * miss_estab2)
			+ (19 * miss_estab3)
			+ (49 * miss_estab4)
			+ (99 * miss_estab5)
			+ (249 * miss_estab6)
			+ (499 * miss_estab7)
			+ (999 * miss_estab8)
			+ (1499 * miss_estab9)
			+ (2499 * miss_estab10)
			+ (4999 * miss_estab11)
			+ (100000000 * miss_estab12)
		if agg_level == `v';
		#delimit cr
		
		* Flag cases where brackets do not overlap
		replace error_aggregation = 1 if (ub_adj < lb_agg) & (agg_level == `v')
		replace error_aggregation = 1 if (lb_adj > ub_agg) & (agg_level == `v')

		* Use tighter aggregation bounds, unless inconsistent with adjusted bounds
		replace lb_adj = lb_agg if (lb_agg > lb_adj) & (lb_agg <= ub_adj) & (agg_level == `v')
		replace ub_adj = ub_agg if (ub_agg >= lb_adj) & (ub_agg < ub_adj) & (agg_level == `v')

		* Set adjusted bracket to corner value when brackets do not overlap
		replace lb_adj = ub_adj if (ub_adj < lb_agg) & (agg_level == `v')
		replace ub_adj = lb_adj if (lb_adj > ub_agg) & (agg_level == `v')

		local v = `v' - 1
	}

	* Aggregation bounds are no longer needed
	drop lb_agg ub_agg

	/* Recursive estimation of typical employment size per establishment size bracket */

	* Imputed employment count
	gen imp_emp = .
	label var imp_emp "Imputed employment count"

	* Start values for employment per firm size bracket are equal to bracket midpoints
	local imp0_cat1 = 9.5
	local imp0_cat2 = 59.5
	local imp0_cat3 = 174.5
	local imp0_cat4 = 374.5
	local imp0_cat5 = 749.5
	local imp0_cat6 = 1749.5
	local imp0_cat7 = 3749.5
	local imp0_cat8 = 7499.5
	local imp0_cat9 = 17499.5
	local imp0_cat10 = 37499.5
	local imp0_cat11 = 74999.5
	local imp0_cat12 = 150000
	
	forvalues z = 1/12 {
		local imp1_cat`z' = `imp0_cat`z''
	}

	* Imputation error
	local x = 100000

	while `x' >= .1 {
		forvalues z = 1/12 {
			local imp0_cat`z' = `imp1_cat`z''
		}

		* Employment per detailed industry based on establishment size distribution and constrained by employment brackets
		#delimit ;
		replace imp_emp =
			estab1 * `imp0_cat1'
			+ estab2 * `imp0_cat2'
			+ estab3 * `imp0_cat3'
			+ estab4 * `imp0_cat4'
			+ estab5 * `imp0_cat5'
			+ estab6 * `imp0_cat6'
			+ estab7 * `imp0_cat7'
			+ estab8 * `imp0_cat8'
			+ estab9 * `imp0_cat9'
			+ estab10 * `imp0_cat10'
			+ estab11 * `imp0_cat11'
			+ estab12 * `imp0_cat12'
		if agg_level == `maxlevel';
		#delimit cr

		replace imp_emp = lb_adj if imp_emp < lb_adj & agg_level == `maxlevel'
		replace imp_emp = ub_adj if imp_emp > ub_adj & agg_level == `maxlevel'
		
		* Regress imputed employment on establishment size distribution, using "good" disaggregate observations
		quietly reg imp_emp estab* if (agg_level == `maxlevel') & (error_consistency == 0), noconstant

		forvalues z = 1/12 {
			local imp1_cat`z' = _b[estab`z']
		}
		
		* Compute absolute sum of coefficient changes
		#delimit ;
		local x =
			abs(`imp1_cat1' - `imp0_cat1')
			+ abs(`imp1_cat2' - `imp0_cat2')
			+ abs(`imp1_cat3' - `imp0_cat3')
			+ abs(`imp1_cat4' - `imp0_cat4')
			+ abs(`imp1_cat5' - `imp0_cat5')
			+ abs(`imp1_cat6' - `imp0_cat6')
			+ abs(`imp1_cat7' - `imp0_cat7')
			+ abs(`imp1_cat8' - `imp0_cat8')
			+ abs(`imp1_cat9' - `imp0_cat9')
			+ abs(`imp1_cat10' - `imp0_cat10')
			+ abs(`imp1_cat11' - `imp0_cat11')
			+ abs(`imp1_cat12' - `imp0_cat12');
	   #delimit cr
	}
	
	* Verify that estimated establishment sizes fall within the appropriate intervals
	assert `imp0_cat1' >= 0 & `imp0_cat1' <= 4
	assert `imp0_cat2' >= 5 & `imp0_cat2' <= 9
	assert `imp0_cat3' >= 10 & `imp0_cat3' <= 19
	assert `imp0_cat4' >= 20 & `imp0_cat4' <= 49
	assert `imp0_cat5' >= 50 & `imp0_cat5' <= 99
	assert `imp0_cat6' >= 100 & `imp0_cat6' <= 249
	assert `imp0_cat7' >= 250 & `imp0_cat7' <= 499
	assert `imp0_cat8' >= 500 & `imp0_cat8' <= 999
	assert `imp0_cat9' >= 1000 & `imp0_cat9' <= 1499
	assert `imp0_cat10' >= 1500 & `imp0_cat10' <= 2499
	assert `imp0_cat11' >= 2500 & `imp0_cat11' <= 4999
	assert `imp0_cat12' >= 5000
	
	* Verify that we continue to use observed employment counts wherever possible
	assert imp_emp == emp if (agg_level == `maxlevel') & (empflag == "")

		display "sixth"
		
	/*
		Impute employment in firms missing detailed industry codes.
		Compute bounds for these missing establishments too.
	*/

	* Use point estimates to impute employment in "missing" level-v establishments
	#delimit ;
	gen miss_emp =
		miss_estab1 * `imp0_cat1'
		+ miss_estab2 * `imp0_cat2'
		+ miss_estab3 * `imp0_cat3'
		+ miss_estab4 * `imp0_cat4'
		+ miss_estab5 * `imp0_cat5'
		+ miss_estab6 * `imp0_cat6'
		+ miss_estab7 * `imp0_cat7'
		+ miss_estab8 * `imp0_cat8'
		+ miss_estab9 * `imp0_cat9'
		+ miss_estab10 * `imp0_cat10'
		+ miss_estab11 * `imp0_cat11'
		+ miss_estab12 * `imp0_cat12'
	if agg_level < `maxlevel';
	#delimit cr

	* Bound employment in "missing" establishments
	#delimit ;
	gen lb_miss =
		miss_estab1 * 0
		+ miss_estab2 * 5
		+ miss_estab3 * 10
		+ miss_estab4 * 20
		+ miss_estab5 * 50
		+ miss_estab6 * 100
		+ miss_estab7 * 250
		+ miss_estab8 * 500
		+ miss_estab9 * 1000
		+ miss_estab10 * 1500
		+ miss_estab11 * 2500
		+ miss_estab12 * 5000
	if agg_level < `maxlevel';
	#delimit cr
	
	#delimit ;
	gen ub_miss =
		miss_estab1 * 4
		+ miss_estab2 * 9
		+ miss_estab3 * 19
		+ miss_estab4 * 49
		+ miss_estab5 * 99
		+ miss_estab6 * 249
		+ miss_estab7 * 499
		+ miss_estab8 * 999
		+ miss_estab9 * 1499
		+ miss_estab10 * 2499
		+ miss_estab11 * 4999
		+ miss_estab12 * 10000000
	if agg_level < `maxlevel';
	#delimit cr

	assert miss_emp >= lb_miss
	assert miss_emp <= ub_miss

		display "seventh"
		
	label var miss_emp "Imputed employment in establishments missing detailed industry code"
	label var lb_miss "Employment lower bound for establishments missing detailed industry code"
	label var ub_miss "Employment upper bound for establishments missing detailed industry code"

	* Impute "unbounded" employment in higher-level industries: "missing" employment plus sum of subordinate employment counts
	gen imp_emp_raw = miss_emp if agg_level < `maxlevel'
	
	local v = `maxlevel' - 1
	while `v' >= 0 {
		* Add employment in subordinate industries
		bysort county `indvar'_`v'd: egen child_emp = total(imp_emp * (parent_level == `v'))
		replace imp_emp_raw = imp_emp_raw + child_emp if agg_level == `v'
		drop child_emp
		
		* Constrain imputed employment to lie within industry employment brackets
		replace imp_emp = imp_emp_raw if agg_level == `v'
		replace imp_emp = lb_adj if (lb_adj > imp_emp_raw) & (agg_level == `v')
		replace imp_emp = ub_adj if (ub_adj < imp_emp_raw) & (agg_level == `v')

		local v = `v' - 1
	}

	* Double check all imputed employment values
	assert imp_emp >= lb_adj
	assert imp_emp <= ub_adj
	assert imp_emp == emp if empflag == ""
	
		display "ninth"

	/*
		Adjust imputed employment for hierarchical consistency.
		Essentially, we proportionally expand/shrink industries wherever we have "wiggle room."
		Industries for which we have exact counts are unaffected by this procedure.
		
		Note that we proceed from the top down to ensure correct employment counts at each level of aggregation.
	*/
	
	forvalues v = 0/`=`maxlevel' - 1' {
		* Absolute value of all deviations of imputed employment numbers from target values within level v
		egen tot_red_0 = total(abs((imp_emp_raw - imp_emp) * (agg_level == `v')))

		* Iterate to convergence
		local x = 2
		while `x' > 1 {
			* Recompute (unbounded) imputed employment, by level-v industry-county cell
			drop imp_emp_raw
			gen contribution = imp_emp if parent_level == `v'
			replace contribution = miss_emp if agg_level == `v'
			bysort countyid `indvar'_`v'd: egen imp_emp_raw = total(contribution)
			drop contribution

			* Required reduction of imputed lower-level employment counts, by level-v industry-county cell
			bysort countyid `indvar'_`v'd: egen tot_red = total((imp_emp_raw - imp_emp) * (agg_level == `v'))

			/*
				Compute employment in subindustries where upper/lower bound of employment bracket is not binding
				These "base" employment counts include both subordinate industries and missing establishments
				
				NOTE: The "contribution" code is clunky. Can probably be improved. Just mind the missing values!
			*/
			
			* Define base for industries that need to be reduced
			gen contribution = imp_emp if (parent_level == `v') & (imp_emp > lb_adj) & (tot_red > 0)
			replace contribution = miss_emp if (agg_level == `v') & (miss_emp > lb_miss) & (tot_red > 0)
			bysort countyid `indvar'_`v'd: egen base_red = total(contribution) if (parent_level == `v' | agg_level == `v')
			drop contribution
			
			* Define base for industries that need to be increased
			gen contribution = imp_emp if (parent_level == `v') & (imp_emp < ub_adj) & (tot_red < 0)
			replace contribution = miss_emp if (agg_level == `v') & (miss_emp < ub_miss) & (tot_red < 0)
			bysort countyid `indvar'_`v'd: egen base_inc = total(contribution) if (parent_level == `v' | agg_level == `v')
			drop contribution
						
			* Compute share of employment reduction that will be attributed to given industry.
			gen sh_red = 0 if (parent_level == `v' | agg_level == `v')
			replace sh_red = imp_emp / base_red if (imp_emp > lb_adj) & (tot_red > 0) & (base_red > 0) & (parent_level == `v')
			replace sh_red = miss_emp / base_red if (miss_emp > lb_miss) & (tot_red > 0) & (base_red > 0) & (agg_level == `v')

			* Compute share of employment increase (negative reduction) for given industry
			* Note: the only observations with zero imputed employment are those that do not have firms.
			* No employment should be attributed to these observations in this step.
			replace sh_red = imp_emp / base_inc if (imp_emp < ub_adj) & (tot_red < 0) & (base_inc > 0) & (parent_level == `v')
			replace sh_red = miss_emp / base_inc if (miss_emp < ub_miss) & (tot_red < 0) & (base_inc > 0) & (agg_level == `v')

			* Verify that shares sum to unity within each county x level-v industry
			#delimit ;
			bysort countyid `indvar'_`v'd: egen tot_sh = total(sh_red)
				if ((tot_red > 0 & base_red > 0) | (tot_red < 0 & base_inc > 0))
				& (parent_level == `v' | agg_level == `v');
			#delimit cr
			
			assert abs(1 - tot_sh) < .001 if tot_sh != .

			* Compute change attributed to subindustry
			gen chg = -(sh_red * tot_red) if tot_red != 0 & (sh_red > 0 & sh_red <= 1) & (parent_level == `v' | agg_level == `v')

			* Adjust imputed subindustry employment
			replace imp_emp = imp_emp + chg if parent_level == `v' & chg != .
			replace miss_emp = miss_emp + chg if agg_level == `v' & chg != .

			* Constrain imputed employment to lie within bounds
			replace imp_emp = lb_adj if imp_emp < lb_adj & parent_level == `v'
			replace imp_emp = ub_adj if imp_emp > ub_adj & parent_level == `v'
			
			replace miss_emp = lb_miss if miss_emp < lb_miss & agg_level == `v'
			replace miss_emp = ub_miss if miss_emp > ub_miss & agg_level == `v'

			* Recompute (unbounded) imputed employment, by level-v industry-county cell
			drop imp_emp_raw
			gen contribution = imp_emp if parent_level == `v'
			replace contribution = miss_emp if agg_level == `v'
			bysort countyid `indvar'_`v'd: egen imp_emp_raw = total(contribution)
			drop contribution
			
			* Absolute value of all deviations of imputed employment numbers from target values
			egen tot_red_1 = total(abs((imp_emp_raw - imp_emp) * (agg_level == `v')))
			local x = abs(tot_red_0 - tot_red_1)
			replace tot_red_0 = tot_red_1

			drop tot_red tot_red_1 base_red base_inc sh_red chg tot_sh
		}
		
		drop tot_red_0
	}
	
	* Verify that (1) all industries have imputed employment and (2) imputed = observed wherever we observe employment values.
	assert imp_emp != .
	assert imp_emp == emp if empflag == ""

	* Replace "true employment count" with imputed counterpart
	replace emp = imp_emp
	drop imp_emp imp_emp_raw
	label var emp "Original/imputed employment count"
	
	/*
		Apportion employment in establishments missing detailed industry codes.

		Let N be the most detailed level of aggregation.
		First, turn any observations at the "N-1"-digit level into "residual industries,"
		i.e. fictitious industries corresponding to any nested N-digit industries that do not appear in the data.
		Then do the same thing for observations that live at the "N-2"-digit level, and again for the "N-3"-digit level, etc.
		The end result is employment counts by county at the most detailed level.
	*/

	* Residualize employment in aggregate industries
	local v = `maxlevel' - 1
	while `v' >= 0 {
		* Compute employment/payroll within each level-v industry's subordinates
		bysort countyid `indvar'_`v'd: egen supord_emp = total(emp * (agg_level == `v'))
		bysort countyid `indvar'_`v'd: egen subord_emp = total(emp * (agg_level > `v'))
		
		* Up to 2006, "the whole should be greater than (or equal to) the sum of its parts" (up to rounding errors)
		* (This fails for one industry in 1990. For now, I subject 1990 to the same treatment as the post-2006 extracts.)
		if (`year' != 1990) & (`year' <= 2006) {
			assert (supord_emp >= subord_emp - 1) if agg_level == `v'
		}
		* Starting in 2007, this fails because the CBP noises up the data
		else {
			* Determine whether a level-v observation is present for each cluster
			bysort countyid `indvar'_`v'd: egen has_level`v'_obs = total(agg_level == `v')
			assert has_level`v'_obs == 0 | has_level`v'_obs == 1

			* Note that this is uniformly zero for 1-digit industries (which don't exist in the NAICS scheme)
			if (`year' != 1990) & (`v' == 1) {
				assert has_level`v'_obs == 0 
			}

			* If such an observation is present, compute excess employment in subindustries
			gen excess = subord_emp - supord_emp if (has_level`v'_obs == 1) & (agg_level > `v') & (supord_emp < subord_emp - 1) & (subord_emp < .)

			* Reduce subindustries proportionally to eliminate the excess (regardless of employment bounds)
			gen subord_sh = emp / subord_emp if (has_level`v'_obs == 1) & (agg_level > `v') & (excess > 0 & excess < .)
			replace emp = emp - (subord_sh * excess) if (has_level`v'_obs == 1) & (agg_level > `v') & (excess > 0 & excess < .)
			drop excess subord_sh
			
			* Check should now succeed
			drop supord_emp subord_emp
			bysort countyid `indvar'_`v'd: egen supord_emp = total(emp * (agg_level == `v'))
			bysort countyid `indvar'_`v'd: egen subord_emp = total(emp * (agg_level > `v'))	
			assert (supord_emp >= subord_emp - 1) if has_level`v'_obs == 1
			drop has_level`v'_obs
		}

		* Turn level-v observations into residual industries (zero out if the level-v industry is too small)
		replace emp = max(0, emp - subord_emp) if agg_level == `v'
		drop supord_emp subord_emp
		
		local v = `v' - 1
	}

	/*
		Exclude auxiliary and unclassified establishments.
		It is important to do this AFTER residualizing the aggregate industries;
		else employment in auxiliary establishments will be retained.
	*/

	if "`indvar'" == "sic" {
		drop if auxiliary == 1
		drop if sic == "99--"
	}
	else {
		drop if naics == "95----"
		drop if naics == "99----"
		assert naics_2d != "20" & naics_2d != "21"
	}

	* At this point, higher-level industries should consist solely of "missing" establishments.
	* The check fails for a single county in 1988; for now, I simply overlook this error.
	* The check also fails over 2007-2011, when artificial noise decouples the counts at levels 0 and 2
	tabstat emp miss_emp, by(agg_level)
	if (`year' != 1988) & `year' <= 2006 {
		assert abs(emp) <= 1 if agg_level == 0
		assert abs(miss_emp) <= 1 if agg_level == 0
	}
	
	
	
*******************************************************
replace qp1=0 if qp1==.
replace ap=0 if ap==.

if "`indvar'" == "naics" {
gen ratioapqp1=ap/qp1 if qp1 != 0 & ap !=0 & agg_level==6
}
if "`indvar'" == "sic" {
gen ratioapqp1=ap/qp1 if qp1 != 0 & ap !=0 & agg_level==4
}
egen medapqp1=median(ratioapqp1)
replace qp1=ap/medapqp1 if qp1==0 & ap !=0 & agg_level==0

foreach w in "qp1" "ap" {
gen `w'_new=`w'
}



***********************************************************
*********  IMPUTING QP1 AND AP for NAICS ******************


if `year' >= 1998 & `year' <= 2016 {
allocate_naics
}

if "`indvar'" == "naics" {

replace emp=0 if agg_level != 6
replace emp=0 if emp<.01

gen empall=emp 
	
foreach v of numlist 0 2 3 4 5 6 {
bysort countyid `indvar'_`v'd: egen emptot_`v'd = total(emp) if `indvar'_`v'd != ""
replace empall= emptot_`v'd if agg_level==`v'
}

foreach v of numlist 0 2 3 4 5 6  {
gen emp_`v'd=emptot_`v'd if agg_level==`v'
replace emp_`v'd=emptot_`v'd if parent_level==`v'-1 & agg_level==`v'+3
replace emp_`v'd=emptot_`v'd if parent_level==`v'-1 & agg_level==`v'+2
replace emp_`v'd=emptot_`v'd if parent_level==`v'-1 & agg_level==`v'+1
replace emp_`v'd=emptot_`v'd if parent_level==`v'-2 & agg_level==`v'+1
replace emp_`v'd=emptot_`v'd if parent_level==`v'-3 & agg_level==`v'+1
}

destring `indvar'_1d, gen(`indvar'_1d_)
destring `indvar'_2d, gen(`indvar'_2d_)
destring `indvar'_3d, gen(`indvar'_3d_)
destring `indvar'_4d, gen(`indvar'_4d_)
destring `indvar'_5d, gen(`indvar'_5d_)
destring `indvar'_6d, gen(`indvar'_6d_)

bysort countyid `indvar'_5d: egen `indvar'6dmin=min(`indvar'_6d_)
replace `indvar'6dmin=. if agg_level==5
gen emp_5d_= emp_5d
replace emp_5d_=. if `indvar'_6d_!= `indvar'6dmin
replace emp_5d=emp_5d_
drop emp_5d_
gen emp_4d_= emp_4d
bysort countyid `indvar'_4d: egen `indvar'5dmin=min(`indvar'_5d_)
replace `indvar'5dmin=. if agg_level==4
replace emp_4d_=. if `indvar'_5d_!= `indvar'5dmin
replace emp_4d=emp_4d_
drop emp_4d_
gen emp_3d_= emp_3d
bysort countyid `indvar'_3d: egen `indvar'4dmin=min(`indvar'_4d_)
replace `indvar'4dmin=. if agg_level==3
replace emp_3d_=. if `indvar'_4d_!= `indvar'4dmin
replace emp_3d=emp_3d_
drop emp_3d_
gen emp_2d_= emp_2d
bysort countyid `indvar'_2d: egen `indvar'3dmin=min(`indvar'_3d_)
replace `indvar'3dmin=. if agg_level==2
replace emp_2d_=. if `indvar'_3d_!= `indvar'3dmin
replace emp_2d=emp_2d_
drop emp_2d_


foreach w in "ap" "qp1" {

*From 0 to 1

***ERRROR BECAUSE PARENT LEVEL MATTERS TOO!
foreach v of numlist 0 2 3 4 5 6 {
gen `w'_`v'd=`w' if agg_level==`v'
replace `w'_`v'd=`w' if parent_level==`v'-1 & agg_level==`v'+3
replace `w'_`v'd=`w' if parent_level==`v'-1 & agg_level==`v'+2
replace `w'_`v'd=`w' if parent_level==`v'-1 & agg_level==`v'+1
replace `w'_`v'd=`w' if parent_level==`v'-2 & agg_level==`v'+1
replace `w'_`v'd=`w' if parent_level==`v'-3 & agg_level==`v'+1
}

gen `w'_5d_= `w'_5d
replace `w'_5d_=. if `indvar'_6d_!= `indvar'6dmin
replace `w'_5d=`w'_5d_
drop `w'_5d_
gen `w'_4d_= `w'_4d
replace `w'_4d_=. if `indvar'_5d_!= `indvar'5dmin
replace `w'_4d=`w'_4d_
drop `w'_4d_
gen `w'_3d_= `w'_3d
replace `w'_3d_=. if `indvar'_4d_!= `indvar'4dmin
replace `w'_3d=`w'_3d_
drop `w'_3d_
gen `w'_2d_= `w'_2d
replace `w'_2d_=. if `indvar'_3d_!= `indvar'3dmin
replace `w'_2d=`w'_2d_
drop `w'_2d_


gen `w'_6dmin=`w'_6d

bysort countyid `indvar'_5d: egen `w'_5dmin_=total(`w'_6d)
replace `w'_5dmin_=. if `w'_5d==.
gen `w'_5dmin=max(`w'_5dmin_,`w'_5d)
replace `w'_5d=`w'_5dmin if `w'_5d>0 & `w'_5d!=. & `w'_5d<`w'_5dmin

 
bysort countyid `indvar'_4d: egen `w'_4dmin_=total(`w'_5dmin)
replace `w'_4dmin_=. if `w'_4d==.
gen `w'_4dmin=max(`w'_4dmin_,`w'_4d)
replace `w'_4d=`w'_4dmin if `w'_4d>0 & `w'_4d!=. & `w'_4d<`w'_4dmin


bysort countyid `indvar'_3d: egen `w'_3dmin_=total(`w'_4dmin)
replace `w'_3dmin_=. if `w'_3d==.
gen `w'_3dmin=max(`w'_3dmin_,`w'_3d)
replace `w'_3d=`w'_3dmin if `w'_3d>0 & `w'_3d!=. & `w'_3d<`w'_3dmin

bysort countyid `indvar'_2d: egen `w'_2dmin_=total(`w'_3dmin)
replace `w'_2dmin_=. if `w'_2d==.
gen `w'_2dmin=max(`w'_2dmin_,`w'_2d)
replace `w'_2d=`w'_2dmin if `w'_2d>0 & `w'_2d!=. & `w'_2d<`w'_2dmin




**HOW DO I CARRYOVER THE EMPLOYMENT?

gen emp`w'_6dmin=emp_6d if `w'_6d != 0
replace emp`w'_6dmin=0 if `w'_6d ==0 

bysort countyid `indvar'_5d: egen emp`w'_5dmin_=total(emp`w'_6dmin)
replace emp`w'_5dmin_=. if emp_5d==.
gen emp`w'_5dmin=emp`w'_5dmin_
replace emp`w'_5dmin=max(emp`w'_5dmin_,emp_5d*(`w'_5d!=0))

bysort countyid `indvar'_4d: egen emp`w'_4dmin_=total(emp`w'_5dmin)
replace emp`w'_4dmin_=. if emp_4d==.
gen emp`w'_4dmin=emp`w'_4dmin_
replace emp`w'_4dmin=max(emp`w'_4dmin_,emp_4d*(`w'_4d!=0))

bysort countyid `indvar'_3d: egen emp`w'_3dmin_=total(emp`w'_4dmin)
replace emp`w'_3dmin_=. if emp_3d==.
gen emp`w'_3dmin=emp`w'_3dmin_
replace emp`w'_3dmin=max(emp`w'_3dmin_,emp_3d*(`w'_3d!=0))

bysort countyid `indvar'_2d: egen emp`w'_2dmin_=total(emp`w'_3dmin)
replace emp`w'_2dmin_=. if emp_2d==.
gen emp`w'_2dmin=emp`w'_2dmin_
replace emp`w'_2dmin=max(emp`w'_2dmin_,emp_2d*(`w'_2d!=0))

bysort countyid: egen `w'tot_0d0d = total(`w'_0d)

drop if `w'tot_0d0d==0


bysort countyid: egen `w'tot_2d=total(`w'_2dmin)
gen diff`w'_0d2d= `w'tot_0d0d- `w'tot_2d
sum diff`w'_0d2d


bysort countyid: egen emp_miss`w'_2d = total(emp_2d-emp`w'_2dmin) 
gen weight`w'_0d2d=((emp_2d-emp`w'_2dmin)/emp_miss`w'_2d) 
replace weight`w'_0d2d=0 if weight`w'_0d2d==.
replace `w'_2d=weight`w'_0d2d*diff`w'_0d2d+`w'_2dmin if `w'_2d !=. & `w'_2d==0

*  2 to 3, 3 to 4, 4 to 5 , 5 to 6

foreach v of numlist 2 3 4 5 {
local z=`v'+1

bysort countyid `indvar'_`v'd: egen `w'tot_`v'd`v'd = total(`w'_`v'd)
bysort countyid `indvar'_`v'd: egen `w'tot_`z'd=total(`w'_`z'dmin)
gen diff`w'_`v'd`z'd= `w'tot_`v'd`v'd- `w'tot_`z'd
sum diff`w'_`v'd`z'd


bysort countyid `indvar'_`v'd: egen emp_miss`w'_`z'd = total(emp_`z'd-emp`w'_`z'dmin) 
gen weight`w'_`v'd`z'd=((emp_`z'd-emp`w'_`z'dmin)/emp_miss`w'_`z'd) 
replace weight`w'_`v'd`z'd=0 if weight`w'_`v'd`z'd==.
replace `w'_`z'd=weight`w'_`v'd`z'd*diff`w'_`v'd`z'd+`w'_`z'dmin if `w'_`z'd !=. & `w'_`z'd==0
}

foreach v of numlist 2 3 4 5 6 {
replace `w'_new=`w'_`v'd if agg_level==`v'
}
}
}


*********************************************************
*********  IMPUTING QP1 AND AP for SIC ******************


if `year' >= 1988 & `year' <= 1997 {

	* Allocate aggregate industries recursively
	foreach v of numlist 3 2 1 0 {
		* Apart from a glitch in 1988, no need to allocate level-0 employment
		if `v' == 0 & `year' > 1988 {
			continue
		}
 foreach w in "emp"  {
		* Compute quantity of employment to be allocated, at the county x industry level
		bysort countyid sic_`v'd: egen alloc_`w' = total(`w' * (agg_level == `v')) if sic_`v'd != ""

		* Compute total value of employment in subordinate 4-digit industries, at the national level
		bysort sic_`v'd: egen level_`w' = total(`w' * (agg_level == 4)) if sic_`v'd != ""

		* Verify that recipient industries have positive employment
		assert level_`w' > 0 & level_`w' < . if sic_`v'd != ""

		* Compute each 4-digit industry's share of employment in the superordinate industry
		gen level_share_`w' = `w' / level_`w' if agg_level == 4

		* Allocate the outcome on the basis of shares
		replace `w' = `w' + (alloc_`w' * level_share_`w') if agg_level == 4

		* Zero out the industries that were just allocated
		replace `w' = 0 if agg_level == `v'

		* Cleanup
		drop alloc_`w' level_`w' level_share_`w'
		}
	}
	
	* No need to allocate employment at level-0
	assert agg_level == 4 if abs(emp) > 1
}


if "`indvar'" == "sic" {

replace emp=0 if agg_level != 4
replace emp=0 if emp<.01

gen empall=emp 
	
foreach v of numlist 0 1 2 3 4 {
bysort countyid `indvar'_`v'd: egen emptot_`v'd = total(emp) if `indvar'_`v'd != ""
replace empall= emptot_`v'd if agg_level==`v'
}

foreach v of numlist 0 1 2 3 4 {
gen emp_`v'd=emptot_`v'd if agg_level==`v'
replace emp_`v'd=emptot_`v'd if parent_level==`v'-1 & agg_level==`v'+3
replace emp_`v'd=emptot_`v'd if parent_level==`v'-1 & agg_level==`v'+2
replace emp_`v'd=emptot_`v'd if parent_level==`v'-1 & agg_level==`v'+1
replace emp_`v'd=emptot_`v'd if parent_level==`v'-2 & agg_level==`v'+1
replace emp_`v'd=emptot_`v'd if parent_level==`v'-3 & agg_level==`v'+1
}

destring `indvar'_1d, gen(`indvar'_1d_)
destring `indvar'_2d, gen(`indvar'_2d_)
destring `indvar'_3d, gen(`indvar'_3d_)
destring `indvar'_4d, gen(`indvar'_4d_)

bysort countyid `indvar'_3d: egen `indvar'4dmin=min(`indvar'_4d_)
replace `indvar'4dmin=. if agg_level==3
gen emp_3d_= emp_3d
replace emp_3d_=. if `indvar'_4d_!= `indvar'4dmin
replace emp_3d=emp_3d_
drop emp_3d_
gen emp_2d_= emp_2d
bysort countyid `indvar'_2d: egen `indvar'3dmin=min(`indvar'_3d_)
replace `indvar'3dmin=. if agg_level==2
replace emp_2d_=. if `indvar'_3d_!= `indvar'3dmin
replace emp_2d=emp_2d_
drop emp_2d_
gen emp_1d_= emp_1d
bysort countyid `indvar'_1d: egen `indvar'2dmin=min(`indvar'_2d_)
replace `indvar'2dmin=. if agg_level==1
replace emp_1d_=. if `indvar'_2d_!= `indvar'2dmin
replace emp_1d=emp_1d_
drop emp_1d_



foreach w in "ap" "qp1" {

*From 0 to 1

***ERRROR BECAUSE PARENT LEVEL MATTERS TOO!
foreach v of numlist 0 1 2 3 4 {
gen `w'_`v'd=`w' if agg_level==`v'
replace `w'_`v'd=`w' if parent_level==`v'-1 & agg_level==`v'+3
replace `w'_`v'd=`w' if parent_level==`v'-1 & agg_level==`v'+2
replace `w'_`v'd=`w' if parent_level==`v'-1 & agg_level==`v'+1
replace `w'_`v'd=`w' if parent_level==`v'-2 & agg_level==`v'+1
replace `w'_`v'd=`w' if parent_level==`v'-3 & agg_level==`v'+1
}

gen `w'_3d_= `w'_3d
replace `w'_3d_=. if `indvar'_4d_!= `indvar'4dmin
replace `w'_3d=`w'_3d_
drop `w'_3d_
gen `w'_2d_= `w'_2d
replace `w'_2d_=. if `indvar'_3d_!= `indvar'3dmin
replace `w'_2d=`w'_2d_
drop `w'_2d_
gen `w'_1d_= `w'_1d
replace `w'_1d_=. if `indvar'_2d_!= `indvar'2dmin
replace `w'_1d=`w'_1d_
drop `w'_1d_

gen `w'_4dmin=`w'_4d

bysort countyid `indvar'_3d: egen `w'_3dmin_=total(`w'_4d)
replace `w'_3dmin_=. if `w'_3d==.
gen `w'_3dmin=max(`w'_3dmin_,`w'_3d)
replace `w'_3d=`w'_3dmin if `w'_3d>0 & `w'_3d!=. & `w'_3d<`w'_3dmin

 
bysort countyid `indvar'_2d: egen `w'_2dmin_=total(`w'_3dmin)
replace `w'_2dmin_=. if `w'_2d==.
gen `w'_2dmin=max(`w'_2dmin_,`w'_2d)
replace `w'_2d=`w'_2dmin if `w'_2d>0 & `w'_2d!=. & `w'_2d<`w'_2dmin



bysort countyid `indvar'_1d: egen `w'_1dmin_=total(`w'_2dmin)
replace `w'_1dmin_=. if `w'_1d==.
gen `w'_1dmin=max(`w'_1dmin_,`w'_1d)
replace `w'_1d=`w'_1dmin if `w'_1d>0 & `w'_1d!=. & `w'_1d<`w'_1dmin



**HOW DO I CARRYOVER THE EMPLOYMENT?

gen emp`w'_4dmin=emp_4d if `w'_4d != 0
replace emp`w'_4dmin=0 if `w'_4d ==0 

bysort countyid `indvar'_3d: egen emp`w'_3dmin_=total(emp`w'_4dmin)
replace emp`w'_3dmin_=. if emp_3d==.
gen emp`w'_3dmin=emp`w'_3dmin_
replace emp`w'_3dmin=max(emp`w'_3dmin_,emp_3d*(`w'_3d!=0))

bysort countyid `indvar'_2d: egen emp`w'_2dmin_=total(emp`w'_3dmin)
replace emp`w'_2dmin_=. if emp_2d==.
gen emp`w'_2dmin=emp`w'_2dmin_
replace emp`w'_2dmin=max(emp`w'_2dmin_,emp_2d*(`w'_2d!=0))

bysort countyid `indvar'_1d: egen emp`w'_1dmin_=total(emp`w'_2dmin)
replace emp`w'_1dmin_=. if emp_1d==.
gen emp`w'_1dmin=emp`w'_1dmin_
replace emp`w'_1dmin=max(emp`w'_1dmin_,emp_1d*(`w'_1d!=0))



bysort countyid: egen `w'tot_0d0d = total(`w'_0d)

drop if `w'tot_0d0d==0


bysort countyid: egen `w'tot_1d=total(`w'_1dmin)
gen diff`w'_0d1d= `w'tot_0d0d- `w'tot_1d
sum diff`w'_0d1d


bysort countyid: egen emp_miss`w'_1d = total(emp_1d-emp`w'_1dmin) 
gen weight`w'_0d1d=((emp_1d-emp`w'_1dmin)/emp_miss`w'_1d) 
replace weight`w'_0d1d=0 if weight`w'_0d1d==.
replace `w'_1d=weight`w'_0d1d*diff`w'_0d1d+`w'_1dmin if `w'_1d !=. & `w'_1d==0

* 1 to 2, 2 to 3, 3 to 4

foreach v of numlist 1 2 3 {
local z=`v'+1

bysort countyid `indvar'_`v'd: egen `w'tot_`v'd`v'd = total(`w'_`v'd)
bysort countyid `indvar'_`v'd: egen `w'tot_`z'd=total(`w'_`z'dmin)
gen diff`w'_`v'd`z'd= `w'tot_`v'd`v'd- `w'tot_`z'd
sum diff`w'_`v'd`z'd


bysort countyid `indvar'_`v'd: egen emp_miss`w'_`z'd = total(emp_`z'd-emp`w'_`z'dmin) 
gen weight`w'_`v'd`z'd=((emp_`z'd-emp`w'_`z'dmin)/emp_miss`w'_`z'd) 
replace weight`w'_`v'd`z'd=0 if weight`w'_`v'd`z'd==.
replace `w'_`z'd=weight`w'_`v'd`z'd*diff`w'_`v'd`z'd+`w'_`z'dmin if `w'_`z'd !=. & `w'_`z'd==0
}

foreach v of numlist 1 2 3 4 {
replace `w'_new=`w'_`v'd if agg_level==`v'
}


}
}

********************************************************
*********   ENDS IMPUTING QP1 AND AP  ******************
	
drop ap qp1
ren ap_new ap
ren qp1_new qp1
	
	
	
	*******************************************************
	
	* Restrict to variables we need
	keep countyid `indvar' emp agg_level `indvar'_*d est ap qp1
	
	
	
		/* Map 2017 NAICS codes into 2012 NAICS codes */

	if `year' >= 2017 & `year' <= 2019 {
		* Before mapping 2017 NAICS into 2012 NAICS, first need to allocate all aggregate industries
		allocate_naics
		keep if agg_level == 6
		assert agg_level == 6
		
		* Observations at the 6-digit level are purely numeric
		destring naics, replace

		* Merge in 2012 NAICS codes for codes that changed; all other codes remain unaffected
		rename naics naics17
		joinby naics17 using "`inddir'`s'naics`s'naics17_naics12.dta", unmatched(master)
		replace naics12 = naics17 if _merge == 1
		replace weight = 1 if _merge == 1

		* Weights should sum to unity within each county x industry, up to floating point error
		bysort countyid naics17: egen tot_weight = total(weight)
		assert abs(1 - tot_weight) < .001

		* Collapse to the level of 2002 NAICS
		replace emp = emp * weight
		replace est=est*weight
		replace qp1=qp1*weight
		replace ap=ap*weight
		collapse (sum) emp est ap qp1, by(countyid naics12)
		rename naics12 naics
		tostring naics, replace
		
		* Recollapse to avoid duplicate industry codes
		collapse (sum) emp est ap qp1, by(countyid naics)

		* Merge in the 2012 NAICS hierarchy
		merge m:1 naics using "`nestingdir'`s'naics12_nesting.dta", assert(2 3)
		keep if _merge == 3
		drop _merge
	}
	
		/* Map 2012 NAICS codes into 2007 NAICS codes */

	if `year' >= 2012 & `year' <= 2019 {
		* Before mapping 2012 NAICS into 2007 NAICS, first need to allocate all aggregate industries
		allocate_naics
		keep if agg_level == 6
		assert agg_level == 6
		
		* Observations at the 6-digit level are purely numeric
		destring naics, replace

		* Merge in 2007 NAICS codes for codes that changed; all other codes remain unaffected
		rename naics naics12
		joinby naics12 using "`inddir'`s'naics`s'naics12_naics07.dta", unmatched(master)
		replace naics07 = naics12 if _merge == 1
		replace weight = 1 if _merge == 1

		* Weights should sum to unity within each county x industry, up to floating point error
		bysort countyid naics12: egen tot_weight = total(weight)
		assert abs(1 - tot_weight) < .001

		* Collapse to the level of 2007 NAICS
		replace emp = emp * weight
		replace est=est*weight
		replace qp1=qp1*weight
		replace ap=ap*weight
		collapse (sum) emp est ap qp1, by(countyid naics07)
		rename naics07 naics
		tostring naics, replace
		
		* Recollapse to avoid duplicate industry codes
		collapse (sum) emp est ap qp1, by(countyid naics)

		* Merge in the 2007 NAICS hierarchy
		merge m:1 naics using "`nestingdir'`s'naics07_nesting.dta", assert(2 3)
		keep if _merge == 3
		drop _merge
	}
	

	/* Map 2007 NAICS codes into 2002 NAICS codes */

	if `year' >= 2008 & `year' <= 2019 {
		* Before mapping 2007 NAICS into 2002 NAICS, first need to allocate all aggregate industries
		allocate_naics
		keep if agg_level == 6
		assert agg_level == 6
		
		* Observations at the 6-digit level are purely numeric
		destring naics, replace

		* Merge in 2002 NAICS codes for codes that changed; all other codes remain unaffected
		rename naics naics07
		joinby naics07 using "`inddir'`s'naics`s'naics07_naics02.dta", unmatched(master)
		replace naics02 = naics07 if _merge == 1
		replace weight = 1 if _merge == 1

		* Weights should sum to unity within each county x industry, up to floating point error
		bysort countyid naics07: egen tot_weight = total(weight)
		assert abs(1 - tot_weight) < .001

		* Collapse to the level of 2002 NAICS
		replace emp = emp * weight
		replace est=est*weight
		replace qp1=qp1*weight
		replace ap=ap*weight
		collapse (sum) emp est ap qp1, by(countyid naics02)
		rename naics02 naics
		tostring naics, replace
		
		* Recollapse to avoid duplicate industry codes
		collapse (sum) emp est ap qp1, by(countyid naics)

		* Merge in the 2002 NAICS hierarchy
		merge m:1 naics using "`nestingdir'`s'naics02_nesting.dta", assert(2 3)
		keep if _merge == 3
		drop _merge
	}

	/* Map 2002 NAICS codes into 1997 NAICS codes. */
	
	if `year' >= 2003 & `year' <= 2019 {
		* Before mapping 2002 NAICS into 1997 NAICS, first need to allocate all aggregate industries
		allocate_naics
		keep if agg_level == 6
		assert agg_level == 6

		* Observations at the 6-digit level are purely numeric
		destring naics, replace
		
		* Modify industry codes as needed to ensure proper mapping to 1997 codes
		replace naics = 425100 if naics == 425110	/* Business to business electronic markets */
		replace naics = 425100 if naics == 425120	/* Wholesale trade agents and brokers */
		collapse (sum) emp est ap qp1, by(countyid naics)

		* Merge in 1997 NAICS codes for codes that changed; all other codes remain unaffected
		rename naics naics02
		joinby naics02 using "`inddir'`s'naics`s'naics02_naics97.dta", unmatched(master)
		replace naics97 = naics02 if _merge == 1
		replace weight = 1 if _merge == 1

		* Weights should sum to unity within each county x industry, up to floating point error
		bysort countyid naics02: egen tot_weight = total(weight)
		assert abs(1 - tot_weight) < .001

		* Collapse to the level of 1997 NAICS
		replace emp = emp * weight
		replace est=est*weight
		replace qp1=qp1*weight
		replace ap=ap*weight
		collapse (sum) emp est ap qp1, by(countyid naics97)
		rename naics97 naics
		tostring naics, replace
		
		* Modify industry codes that aren't properly handled by the official Census crosswalk
		replace naics = "233110" if naics == "237210"	/* Land subdivision */
		replace naics = "235920" if naics == "238150"	/* Glass and glazing contractors */
		replace naics = "235520" if naics == "238330"	/* Flooring contractors */
		replace naics = "235430" if naics == "238340"	/* Tile and Terrazzo Contractors */
		replace naics = "454110" if naics == "454112"	/* Electronic auctions */

		* Recollapse to avoid duplicate industry codes
		collapse (sum) emp est ap qp1, by(countyid naics)

		* Merge in the 1997 NAICS hierarchy
		merge m:1 naics using "`nestingdir'`s'naics97_nesting.dta", assert(2 3)
		keep if _merge == 3
		drop _merge
	}
	
	/* Map 1997 NAICS codes into 1987 SIC codes */

	if `year' >= 1998 & `year' <= 2019 {
		* Before mapping NAICS into SIC, first need to allocate all aggregate industries
		allocate_naics
		keep if agg_level == 6
		assert agg_level == 6

		* Observations at the 6-digit level are purely numeric
		destring naics, replace

		* Merge in SIC87 codes
		joinby naics using "`inddir'`s'naics97_sic87`s'naics97_sic87.dta", unmatched(both) _merge(naics_merge)
	
		* All NAICS industry codes should be accounted for
		assert naics_merge == 2 | naics_merge == 3
		keep if naics_merge == 3
		drop naics_merge
		
		* Weights should sum to unity within each county x industry, up to floating point error
		bysort countyid naics: egen tot_weight = total(weight)
		assert abs(1 - tot_weight) < .001

		* Drop employment in "allocated" auxiliary industries (i.e., outside the designated auxiliary codes)
		foreach ss in 10001 20001 30001 40001 50001 60001 70001 80001 90001 {
			drop if sic == `ss'
		}
		
		* Collapse to the SIC87 level
		replace emp = emp * weight
		replace est=est*weight
		replace qp1=qp1*weight
		replace ap=ap*weight
		
		collapse (sum) emp est ap qp1, by(countyid sic)
		
		* Convert back to string, since for years with native SIC codes, sic is a string at this point
		tostring sic, replace
		replace sic = "0" + sic if length(sic) == 3
		assert length(sic) == 4
		
		* Merge in the SIC hierarchy
		merge m:1 sic using "`nestingdir'`s'sic87_nesting.dta", assert(2 3)
		keep if _merge == 3
		drop _merge

		* Rename for symmetry with NAICS variable names
		forvalues i = 0/4 {
			rename sic87_`i'd sic_`i'd
		}

		* Exclude auxiliary employment as well as unclassified establishments
		drop if auxiliary == 1
		drop if sic == "99--"		
	}
	
	/*
		Aggregate SIC industries as needed to ensure balanced representation.
		Most aggregations are performed by the script sic87_sic87dd.do.
		Here, I perform only those aggregations needed to ensure proper functioning of this cleaning code.
	*/
	
	* Combine "Farm labor contractors" with "Farm management services".
	* Need to do this because, starting in 1998, the codes subordinate to 0760 have no employment.
	replace sic = "0760" if sic == "0761" | sic == "0762"

	bysort countyid: egen emp_sum = total(emp * (sic == "0760"))
	bysort countyid: egen est_sum = total(est * (sic == "0760"))
	bysort countyid: egen qp1_sum = total(qp1 * (sic == "0760"))
	bysort countyid: egen ap_sum = total(ap * (sic == "0760"))
	
	replace emp = emp_sum if sic == "0760"
	replace est = est_sum if sic == "0760"
	replace qp1 = qp1_sum if sic == "0760"
	replace ap = ap_sum if sic == "0760"
	
	drop emp_sum
	drop est_sum
	drop qp1_sum
	drop ap_sum

	drop if sic == "0760" & sic_4d != ""
	replace agg_level = 4 if sic == "0760"
	replace sic_4d = "0760" if sic == "0760"
	bysort countyid sic: assert _N == 1

	/*
		Recursively allocate employment in aggregate industries.
		I proportionally allocate -local- employment on the basis of -national- employment shares.
	*/

	* Restrict to the variables we need
	keep countyid sic emp est ap qp1 sic_*d agg_level
	tempfile pre_apportionment
	save "`pre_apportionment'.dta", replace

	* Obtain a list of 4-digit industries
	keep sic sic_*d agg_level
	duplicates drop
	bysort sic: assert _N == 1
	tempfile indlist
	save "`indlist'.dta", replace
	
	* Create observations for missing county-industry pairs
	use "`pre_apportionment'.dta", clear
	keep countyid sic emp est ap qp1
	fillin countyid sic
	replace emp = 0 if _fillin == 1
	replace est = 0 if _fillin == 1
	drop _fillin
	*For year 2015
	replace emp=0 if emp<0 
	assert emp >= 0 & emp < .
	merge m:1 sic using "`indlist'.dta", assert(3) nogenerate

	* Allocate aggregate industries recursively
	foreach v of numlist 3 2 1 0 {
		* Apart from a glitch in 1988, no need to allocate level-0 employment
		if `v' == 0 & `year' > 1988 {
			continue
		}

		* Compute quantity of employment to be allocated, at the county x industry level
		bysort countyid sic_`v'd: egen alloc_emp = total(emp * (agg_level == `v')) if sic_`v'd != ""

		* Compute total value of employment in subordinate 4-digit industries, at the national level
		bysort sic_`v'd: egen level_emp = total(emp * (agg_level == 4)) if sic_`v'd != ""

		* Verify that recipient industries have positive employment
		assert level_emp > 0 & level_emp < . if sic_`v'd != ""

		* Compute each 4-digit industry's share of employment in the superordinate industry
		gen level_share = emp / level_emp if agg_level == 4

		* Allocate the outcome on the basis of shares
		replace emp = emp + (alloc_emp * level_share) if agg_level == 4

		* Zero out the industries that were just allocated
		replace emp = 0 if agg_level == `v'

		* Cleanup
		drop alloc_emp level_emp level_share
	}
	
	* No need to allocate employment at level-0
	assert agg_level == 4 if abs(emp) > 1
	keep if agg_level == 4

	* Restrict to the variables we need
	keep countyid sic emp est ap qp1
	gen year = `year'
	order year countyid sic emp est ap qp1

	/* Map 1987 SIC codes into sic87dd codes and collapse to aggregate combined observations */

	* Map to sic87dd codes
	destring sic, replace	
	rename sic sic87
	do "`inddir'`s'sic87_sic87dd`s'sic87_sic87dd.do"

	collapse (sum) emp est ap qp1, by(year countyid sic87dd)

	/* Aggregate non-manufacturing industries to a level that can be linked to the I/O matrix */

	gen manuf = (sic87dd >= 2000 & sic87dd <= 3999)
	merge m:1 sic87dd using "`inddir'`s'sic2io`s'sic87dd_to_sic87dd_pooled.dta", keep(1 3) keepusing(sic87dd_pooled)
	assert _merge == 1 if manuf == 1
	assert _merge == 3 if manuf == 0
	drop _merge

	replace sic87dd = sic87dd_pooled if manuf == 0
	bysort sic87dd (manuf): assert manuf[1] == manuf[_N]
	drop sic87dd_pooled

	collapse (sum) emp est ap qp1, by(year countyid sic87dd)

	rename countyid cty_fips
    compress
	
	/*FIPS code change in South Dakota 46113 to 46102 in 2010 (change to old). See:
	https://www.ddorn.net/data/FIPS_County_Code_Changes.pdf
	*/
    replace cty_fips=46113 if cty_fips==46102
	
	
	* Save COUNTY LEVEL file
	
	save "`cbpdir'`s'cbp_county_`year'.dta", replace

	
	merge m:1 cty_fips using "`czonedir'`s'cty_czone_state_boundaries.dta", keep(1 3) generate(czone_merge)


	* Drop Alaska (FIPS state code 2, CZs 34101 to 34115)
	assert cty_fips >= 2000 & cty_fips <= 2999 if czone_merge == 1
	assert cty_fips >= 2000 & cty_fips <= 2999 if czone >= 34101 & czone <= 34115
	assert (czone >= 34101 & czone <= 34115) | (czone == .) if (cty_fips >= 2000 & cty_fips <= 2000)
	drop if cty_fips >= 2000 & cty_fips <= 2999

	* Drop Hawaii (FIPS state code 15, CZs 35600 and 34701-34703)
	assert cty_fips >= 15000 & cty_fips <= 15999 if czone == 35600 | (czone >= 34701 & czone <= 34703)
	drop if czone == 35600 | (czone >= 34701 & czone <= 34703)

	compress 
	
	collapse (sum) emp est ap qp1 (first) state_min state_max multi count three, by(year czone state sic87dd)

	compress
	* Save cleaned data at the commuting zone-state x sic87dd industry level
	save "`cbpdir'`s'cbp_czone_state_`year'.dta", replace

}


/* Combine annual extracts to get a single file */

*CZONE-STATE

use "`cbpdir'`s'cbp_czone_state_1990.dta", clear
compress
append using "`cbpdir'`s'cbp_czone_state_1991.dta"
compress
append using "`cbpdir'`s'cbp_czone_state_1992.dta"
compress
append using "`cbpdir'`s'cbp_czone_state_1993.dta"
compress
append using "`cbpdir'`s'cbp_czone_state_1994.dta"
compress
append using "`cbpdir'`s'cbp_czone_state_1995.dta"
compress
append using "`cbpdir'`s'cbp_czone_state_1996.dta"
compress
append using "`cbpdir'`s'cbp_czone_state_1997.dta"
compress
append using "`cbpdir'`s'cbp_czone_state_1998.dta"
compress
append using "`cbpdir'`s'cbp_czone_state_1999.dta"
compress
append using "`cbpdir'`s'cbp_czone_state_2000.dta"
compress
append using "`cbpdir'`s'cbp_czone_state_2001.dta"
compress
append using "`cbpdir'`s'cbp_czone_state_2002.dta"
compress
append using "`cbpdir'`s'cbp_czone_state_2003.dta"
compress
append using "`cbpdir'`s'cbp_czone_state_2004.dta"
compress
append using "`cbpdir'`s'cbp_czone_state_2005.dta"
compress
append using "`cbpdir'`s'cbp_czone_state_2006.dta"
compress
append using "`cbpdir'`s'cbp_czone_state_2007.dta"
compress
append using "`cbpdir'`s'cbp_czone_state_2008.dta"
compress
append using "`cbpdir'`s'cbp_czone_state_2009.dta"
compress
append using "`cbpdir'`s'cbp_czone_state_2010.dta"
compress
append using "`cbpdir'`s'cbp_czone_state_2011.dta"
compress
append using "`cbpdir'`s'cbp_czone_state_2012.dta"
compress
append using "`cbpdir'`s'cbp_czone_state_2013.dta"
compress
append using "`cbpdir'`s'cbp_czone_state_2014.dta"
compress
append using "`cbpdir'`s'cbp_czone_state_2015.dta"
compress
append using "`cbpdir'`s'cbp_czone_state_2016.dta"
compress
save "`cbpdir'`s'cbp_czone_state_1990_2016.dta", replace

*COUNTY

use "`cbpdir'`s'cbp_county_1990.dta", clear
compress
append using "`cbpdir'`s'cbp_county_1991.dta"
compress
append using "`cbpdir'`s'cbp_county_1992.dta"
compress
append using "`cbpdir'`s'cbp_county_1993.dta"
compress
append using "`cbpdir'`s'cbp_county_1994.dta"
compress
append using "`cbpdir'`s'cbp_county_1995.dta"
compress
append using "`cbpdir'`s'cbp_county_1996.dta"
compress
append using "`cbpdir'`s'cbp_county_1997.dta"
compress
append using "`cbpdir'`s'cbp_county_1998.dta"
compress
append using "`cbpdir'`s'cbp_county_1999.dta"
compress
append using "`cbpdir'`s'cbp_county_2000.dta"
compress
append using "`cbpdir'`s'cbp_county_2001.dta"
compress
append using "`cbpdir'`s'cbp_county_2002.dta"
compress
append using "`cbpdir'`s'cbp_county_2003.dta"
compress
append using "`cbpdir'`s'cbp_county_2004.dta"
compress
append using "`cbpdir'`s'cbp_county_2005.dta"
compress
append using "`cbpdir'`s'cbp_county_2006.dta"
compress
append using "`cbpdir'`s'cbp_county_2007.dta"
compress
append using "`cbpdir'`s'cbp_county_2008.dta"
compress
append using "`cbpdir'`s'cbp_county_2009.dta"
compress
append using "`cbpdir'`s'cbp_county_2010.dta"
compress
append using "`cbpdir'`s'cbp_county_2011.dta"
compress
append using "`cbpdir'`s'cbp_county_2012.dta"
compress
append using "`cbpdir'`s'cbp_county_2013.dta"
compress
append using "`cbpdir'`s'cbp_county_2014.dta"
compress
append using "`cbpdir'`s'cbp_county_2015.dta"
compress
append using "`cbpdir'`s'cbp_county_2016.dta"
compress
save "`cbpdir'`s'cbp_county_1990_2016.dta", replace

forvalues year = 1990/2016 {
	rm "`cbpdir'`s'cbp_county_`year'.dta"
	rm "`cbpdir'`s'cbp_czone_state_`year'.dta"
}


